home *** CD-ROM | disk | FTP | other *** search
/* index.c
a program to prepare an ordered list of "words" and their frequencies.
a "word" is defined as a run of alphabetic characters (A-Z, a-z) and
apostrophes, folded to lower case; digits and all other characters
end the current word. */
-
- #include <stdio.h>
- #include <ctype.h>
-
/* truth values for flags such as inword */
#define NO 0
#define YES 1

/* character classes returned by accept() */
#define LETTER 0 /* may appear inside a word */
#define NUMBER 1 /* a decimal digit */
#define OTHER 2 /* anything else */

/* sizes */
#define WORDLEN 20 /* size of the word being collected */
#define KILOBYTE 1024 /* size of one chunk of word storage */
-
/* scratch area shared by the whole program: holds each block of input
   text and is also used to format lines of output */
char buffer[BUFSIZ];

/* one node of the binary tree of words; the tree is ordered by strcmp()
   on the word text */
struct list
{
    char *wd;               /* text of a unique word */
    int count;              /* number of occurrences of the word */
    struct list *left;      /* subtree of words that compare lower */
    struct list *right;     /* subtree of words that compare greater */
};

typedef struct list LIST;
typedef struct list *LISTPTR;

int nwords;                 /* declared here; not updated by the code in this file */
int n_unique_words;         /* number of nodes created by countword() */
-
- main (argc, argv) /* index.c */
-
- int argc;
- char *argv[];
-
- {
- LISTPTR root;
- LISTPTR countword();
- char word[WORDLEN];
- int c, /* individual character */
- j, /* counter for word - output */
- i, /* counter for buffer - input */
- nchar, /* number of characters read from text file */
- inword, /* flag - indicates whether we are in a word */
- accpt; /* return value of accept(): indicates whether
- a character is "in" a word. These include
- characters, (single) hyphens, (single)
- apostrophies, but not numbers and other ascii
- values */
- int bufno = 0;
-
- FILE *fopen(),
- *fp;
- FILE *fpout;
- int fread();
- char out_file[13];
-
-
- /*search the command line for a file to read: exit if none */
-
- if (argc == 1)
- { printf ("no file to index");
- exit (1);
- }
-
- else
- {
- while (--argc > 0)
- if ((fp = fopen(*++argv, "r")) == NULL)
- { printf("index: can't open %s\n",*argv);
- break;
- }
- strcpy(out_file,*argv);
- for (i=0; i<9 && out_file[i] != '.'; ++i)
- ;
- out_file[++i] = NULL;
- strcat (out_file, "txt"); /* works */
- printf("OUTPUT file: %s\n",out_file);
- fpout = fopen(out_file, "wb"); /*open output file*/
- if (fpout == NULL)
- {
- printf ("index: cannot open output file");
- exit(1);
- }
- printf("fpout = %d\n",fpout);
- }
-
- inword = NO;
- root = NULL;
- n_unique_words = c = j = 0;
-
- while ( nchar = fread (buffer, 1, BUFSIZ , fp) )
- /* process until no characters returned (EOF)*/
-
- {
- /* printf("nchar = %d, bufno = %d\n",nchar,++bufno);*/
- /* printf("buffer contents %s\n",buffer);*/
-
- for (i=0; i < nchar ; ++i) /*nchar is a count. nchar-1 is last valid
- index */
- {
- if ((accpt = accept( c = buffer[i])) != LETTER)
- {
- if (inword == YES)
- {
- word[j] = '\0';
- /* printf("%s \n",word);*/
- if (validword(word) )
- root = countword (root,word,j+1);
- inword = NO;
- j = 0;
- }
- /* if(c == '\n')
- inword = NO;*/
- }
- else if (accpt == LETTER )
- {
- if (j < WORDLEN) word[j++] = tolower (c);
- inword = YES;
- }
- }
- /* printf("finished running the buffer \n");*/
- /* printf("feof(fp) %d, inword %d\n",feof(fp),inword);*/
-
- if (feof(fp) && inword == YES)
- {
- word[j] = '\0';
- if (validword(word) )
- root = countword (root, word,j+1);
- }
- }
- printf("completed reading file -- output ?follows?\n");
-
- sprintf (buffer,"total words %4d",n_unique_words); /*write total*/
- strcat (buffer, "\r\n"); /*mandatory control chars*/
- fwrite (buffer, 1, strlen(buffer), fpout);
- treeprint(root,fpout); /*call recursive print*/
-
- } /*end of program */
-
- /* RECURSIVE routine to build a tree of words and their frequences*/
- /* the final return address is the address of the ROOT node */
-
- struct list *countword (p, word, wd_len) /*install word at or below p */
- struct list *p; /*count word if already in tree*/
- char word[];
- int wd_len; /* including the terminating NULL */
- {
- static char *wd_buffer = NULL,
- *wd_buffer_top = NULL;
- char *malloc();
- int cond;
-
- /* printf(".");*/
-
- /* printf("countword: %s len= %d\n",word,wd_len);*/
-
- if (p == NULL) /*new word - make a new node */
- {
- p = (LISTPTR ) malloc (sizeof(LIST)); /*allocade a node*/
- if (p == NULL)
- {
- /*error handling */
- ;
- }
- n_unique_words += 1; /* count the new word */
-
- /*====make sure there is enough room to store a new word. If not allocate space*/
-
- if ((wd_buffer_top - wd_buffer) < wd_len) /*how much space remains?*/
- {
- wd_buffer = (char *) malloc(KILOBYTE); /*allocate a chunk*/
- if (wd_buffer == NULL) /*if no space available, take action*/
- {
- /* error handling */
- ;
- }
- wd_buffer_top = wd_buffer + KILOBYTE ; /* top of chunk (+1) */
- }
- p -> wd = wd_buffer; /* pointer to the new word*/
- while (*wd_buffer++ = *word++) /*copy the word /*
- ; /*to a home */
- p -> count = 1; /* count it */
- p -> left = p->right = NULL; /*null the child pointers*/
- }
- else if ((cond = strcmp(word, p->wd)) == 0)
- p->count++; /*repeated word*/
-
- else if (cond < 0) /*lower, go left*/
- p->left = countword (p->left,word,wd_len);
-
- else /*greater, go right*/
- p->right = countword (p->right,word,wd_len);
-
- return(p);
- }
-
-
/* strcmp: compare the strings s and t.
   returns a value less than, equal to or greater than zero according
   to whether s sorts before, the same as or after t.  Characters are
   compared as unsigned char, as the standard library routine of the
   same name does, so the result does not depend on whether plain char
   is signed. */
int strcmp(const char *s, const char *t)
{
    const unsigned char *a = (const unsigned char *) s;
    const unsigned char *b = (const unsigned char *) t;

    /* when the characters are equal and *a is not '\0', *b is not
       '\0' either, so one end-of-string test is enough */
    while (*a != '\0' && *a == *b)
    {
        ++a;
        ++b;
    }

    return ((int) *a - (int) *b);
}
-
- treeprint(p,fpout)
- struct list *p;
- FILE *fpout;
- {
-
- if (p!=NULL)
- {
- treeprint(p->left,fpout);
- sprintf(buffer,"%4d %s",p->count,p->wd);
- strcat( buffer, "\r\n");
- fwrite (buffer, 1, strlen(buffer), fpout);
- treeprint(p->right,fpout);
- }
-
- }
-
-
- accept (c)
- char c;
- {
-
- if (isalpha(c) || c == '\'') return (LETTER);
- if (isdigit(c)) return (NUMBER);
-
- return (OTHER);
- }
-
/* validword: decide whether a collected word should be counted.

   w  the word, terminated by '\0'; it is examined up to the
      terminator, whatever its length

   returns 0 if w starts with a hyphen or contains two apostrophes in
   a row, 1 otherwise (an empty string is valid) */
int validword(const char w[])
{
    int i;

    if (w[0] == '-')
        return (0);                 /* initial hyphens are invalid */

    /* w[i+1] is safe to read: w[i] is not the terminator, so the
       string has at least one more element */
    for (i = 0; w[i] != '\0'; ++i)
    {
        if (w[i] == '\'' && w[i + 1] == '\'')
            return (0);             /* double apostrophe is invalid */
    }

    return (1);
}
-
-